Load preprocessed dataset (preprocessing code in data_preprocessing.Rmd)

Keep DE genes

# Restrict the three aligned objects (expression matrix, gene annotations and
# DE results) to the significantly differentially expressed genes.
# A single NA-free mask is used everywhere: the original only guarded against
# NA padj values in the rowname reassignment, relying on implicit NA dropping
# by filter()/%in% in the other three statements.
# NOTE(review): the rowname reassignment assumes datGenes rows are in the same
# order as DE_info/datExpr rows -- confirm against data_preprocessing.Rmd.
keep = !is.na(DE_info$padj) & DE_info$padj < 0.05
de_genes = rownames(DE_info)[keep]

datExpr = datExpr %>% filter(rownames(.) %in% de_genes)
rownames(datExpr) = datGenes$feature_id[keep]
datGenes = datGenes %>% filter(feature_id %in% de_genes)
DE_info = DE_info %>% filter(padj<0.05)

print(paste0('Keeping ', nrow(datExpr), ' genes and ', ncol(datExpr), ' samples'))
rm(keep, de_genes)
## [1] "Keeping 3000 genes and 80 samples"




Define a gene co-expression similarity

Using Biweight midcorrelation because it’s more robust to outliers than regular correlation or Mutual Information score

# Enable WGCNA's multi-threaded correlation routines
allowWGCNAThreads()
## Allowing multi-threading with up to 4 threads.
# bicor is median/MAD-based (MAD = median absolute deviation), hence robust to
# outliers; it correlates columns, so transpose to genes-as-columns first
cor_mat = bicor(t(datExpr))

Correcting the correlation matrix from \(s \in [-1,1]\) to \(s \in [0,1]\). Two methods are proposed: \(s_{ij}=|bw(i,j)|\) and \(s_{ij}=\frac{1+bw(i,j)}{2}\)

-Using \(s_{ij}=\frac{1+bw(i,j)}{2}\), the strongest negative correlations (-1) get mapped to 0 (no correlation) and the zero correlated genes get mapped to the average correlation (0.5), which I don’t think makes much sense

-Using \(s_{ij}=|bw(i,j)|\) we lose the direction of the correlation, but at least we maintain the magnitude of the correlation of all the genes. Decided to use this one

S = abs(cor_mat)




Define a family of adjacency functions

Chose power adjacency function over the sigmoid function because it has only one parameter to adjust and both methods are supposed to lead to very similar results if the parameters are chosen with the scale-free topology criterion.

Choosing a parameter value

Following the scale-free topology criterion because metabolic networks have been found to display approximate scale free topology

  1. Only consider those parameter values that lead to a network satisfying scale-free topology at least approximately, e.g. signed \(R^2 > 0.80\)
# Scan soft-threshold powers 1..15 and keep the lowest power whose scale-free
# topology fit reaches R^2 > 0.8 (pickSoftThreshold expects samples x genes)
best_power = pickSoftThreshold(t(datExpr), powerVector = 1:15, RsquaredCut = 0.8)
##    Power SFT.R.sq  slope truncated.R.sq mean.k. median.k.  max.k.
## 1      1  0.00342  0.261          0.988 775.000  770.0000 1330.00
## 2      2  0.24500 -1.490          0.986 257.000  247.0000  636.00
## 3      3  0.58100 -2.220          0.963  98.200   90.3000  331.00
## 4      4  0.74900 -2.510          0.990  41.600   36.6000  186.00
## 5      5  0.83600 -2.560          0.990  19.200   15.8000  110.00
## 6      6  0.87300 -2.590          0.994   9.460    7.2500   67.80
## 7      7  0.90000 -2.520          0.994   4.960    3.4900   43.50
## 8      8  0.90800 -2.440          0.988   2.740    1.7600   28.80
## 9      9  0.88700 -2.380          0.951   1.590    0.9140   19.70
## 10    10  0.39100 -3.380          0.358   0.960    0.4870   13.70
## 11    11  0.38500 -3.210          0.349   0.603    0.2660    9.78
## 12    12  0.93500 -1.980          0.982   0.391    0.1490    7.11
## 13    13  0.95700 -1.830          0.985   0.262    0.0850    5.25
## 14    14  0.93300 -1.770          0.941   0.180    0.0495    3.98
## 15    15  0.94200 -1.780          0.964   0.126    0.0295    3.36
print(paste0('Best power for scale free topology: ', best_power$powerEstimate))
## [1] "Best power for scale free topology: 5"

Elevate the matrix to the suggested power

S_sft = S^best_power$powerEstimate



Additional considerations

  1. The mean connectivity should be high so that the network contains enough information

What does high mean? using the power=5 we get a mean connectivity of 20, is this high?

# Mean network connectivity at every candidate power; one value per power,
# built directly instead of filling a preallocated matrix in a loop
mean_connectivity = data.frame(
  'power' = as.numeric(1:15),
  'mean_connectivity' = vapply(1:15, function(p) mean(colSums(S^p)), numeric(1)))

ggplotly(mean_connectivity %>% ggplot(aes(power, mean_connectivity)) + ylab('mean connectivity') +
         geom_vline(xintercept=best_power$powerEstimate, color='gray') + geom_point(color='#0099cc') + 
         scale_y_sqrt() + theme_minimal())
rm(mean_connectivity, best_power)
  1. The slope \(-\hat{\gamma}\) of the regression line between \(log_{10}(p(k))\) and \(log_{10}(k)\) should be around -1

Slope is 2.6 times as steep as it should be (maybe because the range of k is too narrow?)

# Empirical degree distribution: bin gene connectivities (k) into 10 buckets
# and estimate p(k) as the fraction of genes per bucket.
# colSums(S_sft) is now computed once (the original evaluated it twice).
k_df = data.frame('connectivity'=colSums(S_sft)) %>% mutate('bucket'=cut(connectivity, 10)) %>%
       group_by(bucket) %>%
       dplyr::summarize(p_k=n(), k=mean(connectivity)) %>% mutate(p_k=p_k/sum(p_k))

# Slope of the log10(p(k)) ~ log10(k) regression; should be near -1 for
# scale-free topology
lm(log10(p_k) ~ log10(k), data=k_df)
## 
## Call:
## lm(formula = log10(p_k) ~ log10(k), data = k_df)
## 
## Coefficients:
## (Intercept)     log10(k)  
##       2.687       -2.670
rm(k_df)

Exploratory analysis of scale free topology adjacency matrix

  • Degree distribution: the scale-free topology adjacency matrix does have an exponential behaviour in the degree distribution which wasn’t there in the original matrix

Note: The slope is very steep, both axis are on sqrt scale

# Compare the sorted entry distributions of the raw similarity matrix (S) and
# the soft-thresholded one (S_sft). seq_len(nrow(S)^2) enumerates all matrix
# entries explicitly; the original `1:nrow(S)^2` only worked because `^` binds
# tighter than `:` in R.
plot_data = data.frame('n'=seq_len(nrow(S)^2), 'S'=sort(melt(S)$value), 'S_sft'=sort(melt(S_sft)$value))

# Subsample every 100th entry to keep the density plot light
plot_data %>% filter(n%%100==0) %>% dplyr::select(S, S_sft) %>% melt %>% 
              ggplot(aes(value, color=variable, fill=variable)) + geom_density(alpha=0.5) + 
              xlab('k') + ylab('p(k)') + scale_x_sqrt() + scale_y_sqrt() + theme_minimal()

rm(plot_data)
  • “To visually inspect whether approximate scale-free topology is satisfied, one plots log10(p(k)) versus log10(k). A straight line is indicative of scale-free topology”

The example given in the article has a curve similar to this one and they say it’s OK

# Binned degree distribution again, this time for the log-log scatter plot.
# colSums(S_sft) is computed once (the original evaluated it twice).
k_df = data.frame('connectivity'=colSums(S_sft)) %>% mutate('bucket'=cut(connectivity, 10)) %>%
      group_by(bucket) %>%
      dplyr::summarize(p_k=n(), k=mean(connectivity)) %>% mutate(p_k=p_k/sum(p_k))

# Alternative with log-scaled axes instead of log-transformed values:
# k_df %>% ggplot(aes(k,p_k)) + geom_point(color='#0099cc') + geom_smooth(method='lm', se=FALSE, color='gray') + 
#          ylab('p(k)') + scale_x_log10() + scale_y_log10() + theme_minimal()

# A straight point cloud here indicates approximate scale-free topology
k_df %>% ggplot(aes(log10(k),log10(p_k))) + geom_point(color='#0099cc') + geom_smooth(method='lm', se=FALSE, color='gray') + 
         ylab('p(k)') + theme_minimal()

  • “To measure how well a network satisfies a scale-free topology, we propose to use the square of the correlation between log10(p(k)) and log10(k), i.e. the model fitting index \(R^2\) of the linear model that regresses log10(p(k)) on log10(k)”

\(R^2=0.83\) The \(R^2\) we got from the pickSoftThreshold function

# Model fit of the log-log regression: the R^2 here is the same fitting index
# reported by pickSoftThreshold (SFT.R.sq)
summary(lm(log10(p_k) ~ log10(k), data=k_df))
## 
## Call:
## lm(formula = log10(p_k) ~ log10(k), data = k_df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.6508 -0.3561  0.1792  0.2841  0.4629 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   2.6865     0.6685   4.019 0.003848 ** 
## log10(k)     -2.6703     0.3955  -6.751 0.000145 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4222 on 8 degrees of freedom
## Multiple R-squared:  0.8507, Adjusted R-squared:  0.832 
## F-statistic: 45.58 on 1 and 8 DF,  p-value: 0.0001449
rm(k_df)




Defining a measure of node dissimilarity

Using topological overlap dissimilarity measure because it has been found to result in biologically meaningful modules

Create Topological Overlap Matrix (TOM)

1st quartile is already 0.9852, most of the genes are very dissimilar

# Topological Overlap Matrix: similarity based on shared network neighborhoods
TOM = TOMsimilarity(S_sft)
## ..connectivity..
## ..matrix multiplication (system BLAS)..
## ..normalization..
## ..done.
# Dissimilarity for clustering; restore the gene ids that TOMsimilarity drops
dissTOM = 1 - TOM

dimnames(dissTOM) = dimnames(S_sft)

summary(melt(dissTOM))
##               Var1                      Var2             value       
##  ENSG00000000971:   3000   ENSG00000000971:   3000   Min.   :0.0000  
##  ENSG00000001084:   3000   ENSG00000001084:   3000   1st Qu.:0.9852  
##  ENSG00000001626:   3000   ENSG00000001626:   3000   Median :0.9918  
##  ENSG00000002586:   3000   ENSG00000002586:   3000   Mean   :0.9882  
##  ENSG00000002746:   3000   ENSG00000002746:   3000   3rd Qu.:0.9956  
##  ENSG00000002933:   3000   ENSG00000002933:   3000   Max.   :0.9999  
##  (Other)        :8982000   (Other)        :8982000




Identifying gene modules

Using hierarchical clustering on the TOM-based dissimilarity matrix

Using average linkage (following the paper)

In average linkage hierarchical clustering, the distance between two clusters is defined as the average distance between each point in one cluster to every point in the other cluster.

It looks like instead of finding clusters, on each new separation this method discards outliers

# Average-linkage hierarchical clustering on the TOM dissimilarity
dend = hclust(as.dist(dissTOM), method='average')
plot(dend, hang=0, labels=FALSE)

Instead of using a fixed height cut to split the dendrogram into clusters, using a dynamic branch cutting approach taken from Dynamic Tree Cut: in-depth description, tests and applications

Two available methods:

  1. Dynamic Tree Cut: top-down algorithm relying only on the dendrogram and respecting the order of the clustered objects on it. This method is less sensitive to parameter choice but also less flexible (I think I prefer robustness over flexibility, so I’m going to use this one)

  2. Dynamic Hybrid Cut: builds the clusters from bottom up. In addition to information from the dendrogram, it utilizes dissimilarity information among the objects. Seems to me that relies on too many heuristics and has too many parameters to tune

Plus a post processing step:

  • deepSplit: Controls the sensitivity of the algorithm to cluster splits. In Dynamic Tree it controls whether, after recursively processing all clusters, the algorithm should stop or whether it should re-process all clusters until there are no new clusters detected. I’ll do the simple version first and not re-process the clusters.
# Dynamic Tree Cut without the re-processing pass. deepSplit=FALSE replaces
# the shorthand F, which is a reassignable variable in R, not a keyword.
# Genes falling in modules smaller than 20 get label 0 (unassigned).
modules = cutreeDynamicTree(dend, deepSplit=FALSE, minModuleSize=20)
names(modules) = labels(dend)

clusterings[['avg_linkage']] = modules

Merging modules with similar expression profiles

“One can relate modules to each other by correlating the corresponding module eigengenes (Horvath et al., 2005). If two modules are highly correlated, one may want to merge them”

Calculate the “eigengenes” (1st principal component) of each module

# Module palette: position 1 (gray) is reserved for unassigned genes (module 0)
module_colors = c('gray', viridis(max(modules)))

# Module eigengenes (1st principal component of each module's expression);
# moduleEigengenes takes a samples x genes matrix plus one color per gene
MEs_output = moduleEigengenes(t(datExpr), colors = module_colors[as.vector(modules[labels(dend)]) + 1])
MEs = MEs_output$eigengenes
# Rename eigengene columns from 'ME<color>' to 'Mod<module number>'
colnames(MEs) = sapply(colnames(MEs), function(col) paste0('Mod', which(module_colors == substring(col, 3))))

Merge similar modules

# Eigengene correlation heatmap, used to decide which modules to merge
bicor(MEs) %>% as.matrix %>% heatmap(scale='none', 
                                     col=rev(colorRampPalette(brewer.pal(9,'YlGnBu'), bias=2)(100)))

# Two main modules: collapse all modules into the two large correlated groups
# seen in the heatmap (module ids chosen by eye from the heatmap above).
# Each replace() condition is evaluated on the original `modules` vector, so
# the two passes do not interfere; module 0 (unassigned) is in neither list
# and stays 0.
top_modules = modules %>% replace(modules %in% c(3, 5, 6, 9, 10, 11, 12, 13, 16, 17, 18), 1) %>%
                          replace(modules %in% c(1, 2, 4, 7, 8, 14, 15, 19, 20, 21, 22), 2)
clusterings[['avg_linkage_top_clusters']] = top_modules

# Closest modules: merge only the most highly correlated module groups
merged_modules = modules %>% replace(modules %in% c(3,5, 9, 12), 3) %>% replace(modules %in% c(13, 17), 13)
clusterings[['avg_linkage_merged']] = merged_modules
top_module_colors = c('gray', viridis(max(top_modules)))

# Color bars under the dendrogram, one per clustering granularity.
# NOTE(review): 'ID' uses labels(dend) (original object order for an hclust
# object) while the color columns index by dend$order (left-to-right plotting
# order) -- confirm this difference is intended; colored_bars() below only
# reads the color columns, so the plot itself is unaffected.
dend_colors = data.frame('ID'=names(modules[labels(dend)]),
                         'OriginalModules' = module_colors[modules[dend$order]+1],
                         'MergedModules' = module_colors[merged_modules[dend$order]+1],
                         'TopModules' = top_module_colors[top_modules[dend$order]+1])

# Dendrogram with blank labels plus the module color bars
dend %>% as.dendrogram(hang=0) %>% set('labels', rep('', nrow(dissTOM))) %>% plot(ylim=c(min(dend$height),1))
colored_bars(colors=dend_colors[,-1])

rm(MEs, dend, modules, module_colors, MEs_output, top_modules, merged_modules, top_module_colors, dend_colors)



Using complete linkage

In complete linkage hierarchical clustering, the distance between two clusters is defined as the longest distance between two points in each cluster.

# Complete-linkage dendrogram (hclust's default linkage method)
dend = dissTOM %>% as.dist %>% hclust
plot(dend, hang=0, labels=FALSE)

# Dynamic Tree Cut, same settings as for average linkage. deepSplit=FALSE
# replaces the shorthand F (a reassignable variable in R, not a keyword).
modules = cutreeDynamicTree(dend, deepSplit=FALSE, minModuleSize=20)
names(modules) = labels(dend)

clusterings[['comp_linkage']] = modules

Merging modules with similar expression profiles

# Module palette: gray is reserved for unassigned genes (module 0)
module_colors = c('gray', viridis(max(modules)))

# Module eigengenes (1st PC of each module's expression profile)
MEs_output = moduleEigengenes(t(datExpr), colors = module_colors[as.vector(modules[labels(dend)]) + 1])
MEs = MEs_output$eigengenes
# 'ME<color>' -> 'Mod<module number>'
colnames(MEs) = sapply(colnames(MEs), function(col) paste0('Mod', which(module_colors == substring(col, 3))))

# Eigengene correlation heatmap, used to decide which modules to merge
heatmap(as.matrix(bicor(MEs)), scale='none',
        col=rev(colorRampPalette(brewer.pal(9,'YlGnBu'), bias=2)(100)))

# Two main modules: collapse everything into the two large correlated groups
# visible in the eigengene heatmap. Each replace() condition is evaluated on
# the original `modules` vector, so the passes do not interfere; the third
# pass restores the unassigned genes (module 0) that the negated condition in
# the second pass would otherwise have sent to group 2.
top_modules = modules %>% replace(modules %in% c(1, 3, 6, 14), 1) %>%
                          replace(!modules %in% c(1, 3, 6, 14), 2) %>%
                          replace(modules==0, 0)
clusterings[['comp_linkage_top_clusters']] = top_modules

# Closest modules: merge only the most highly correlated module groups
# (module ids chosen by eye from the heatmap above)
merged_modules = modules %>% replace(modules %in% c(11, 22), 11) %>%
                             replace(modules %in% c(15, 24), 15) %>%  
                             replace(modules %in% c(3, 6), 3) %>%
                             replace(modules %in% c(4, 13), 4) %>%
                             replace(modules %in% c(12, 18, 20, 25), 12)
clusterings[['comp_linkage_merged']] = merged_modules
top_module_colors = c('gray', viridis(max(top_modules)))

# Color bars under the dendrogram, one per clustering granularity.
# NOTE(review): 'ID' is in labels(dend) (original object) order while the
# color columns are indexed by dend$order (plotting order) -- confirm this is
# intended; colored_bars() only reads the color columns, so the plot itself
# is unaffected.
dend_colors = data.frame('ID'=names(modules[labels(dend)]),
                         'OriginalModules' = module_colors[modules[dend$order]+1],
                         'MergedModules' = module_colors[merged_modules[dend$order]+1],
                         'TopModules' = top_module_colors[top_modules[dend$order]+1])

# Dendrogram with blank labels plus the module color bars
dend %>% as.dendrogram(hang=0) %>% set('labels', rep('', nrow(dissTOM))) %>% plot(ylim=c(min(dend$height),1))
colored_bars(colors=dend_colors[,-1])

rm(MEs, dend, modules, module_colors, MEs_output, top_modules, merged_modules, top_module_colors, dend_colors)




Exploratory analysis of clustering

Adjusted Rand Index comparison

# Pairwise Adjusted Rand Index between all clusterings (upper triangle,
# including the diagonal as a sanity check).
# Module 0 means 'unassigned', so it is recoded to a real NA before the
# comparison. The original used sub(0, NA, x), which performs *string*
# substitution on the coerced values: it turned module 10 into the factor
# level '1NA' and module 0 into the literal level 'NA' instead of a missing
# value, so unassigned genes were counted as one more cluster.
# NOTE(review): confirm adj.rand.index handles NA entries as intended.
cluster_sim = data.frame(matrix(nrow = length(clusterings), ncol = length(clusterings)))
for(i in seq_along(clusterings)){
  cluster1 = clusterings[[i]] %>% replace(. == 0, NA) %>% as.factor
  for(j in i:length(clusterings)){
    cluster2 = clusterings[[j]] %>% replace(. == 0, NA) %>% as.factor
    cluster_sim[i,j] = adj.rand.index(cluster1, cluster2)
  }
}
colnames(cluster_sim) = names(clusterings)
rownames(cluster_sim) = colnames(cluster_sim)

cluster_sim = cluster_sim %>% as.matrix %>% round(2)
heatmap.2(x = cluster_sim, Rowv = FALSE, Colv = FALSE, dendrogram = 'none', 
          cellnote = cluster_sim, notecol = 'black', trace = 'none', key = FALSE, 
          cexRow = 1, cexCol = 1, margins = c(12,12))

rm(i, j, cluster1, cluster2, cluster_sim)

PCA

# PCA of the expression matrix (genes as rows), colored by each clustering
pca = datExpr %>% prcomp

# Recode unassigned genes (module 0) to NA before converting to factor.
# The original sub(0, NA, x) did string substitution, corrupting two-digit
# module ids (10 -> '1NA') and creating a spurious 'NA' factor level for
# unassigned genes instead of a real missing value.
module_factor = function(name) clusterings[[name]] %>% replace(. == 0, NA) %>% as.factor

plot_data = data.frame('ID' = rownames(datExpr), 'PC1' = pca$x[,1], 'PC2' = pca$x[,2], 'PC3' = pca$x[,3],
                       'avg_linkage' = module_factor('avg_linkage'),
                       'avg_linkage_merged' = module_factor('avg_linkage_merged'),
                       'avg_linkage_top_clusters' = module_factor('avg_linkage_top_clusters'),
                       'comp_linkage' = module_factor('comp_linkage'),
                       'comp_linkage_merged' = module_factor('comp_linkage_merged'),
                       'comp_linkage_top_clusters' = module_factor('comp_linkage_top_clusters'))

# Drop the ID column for plotting (the original first argument had a stray
# trailing comma: plot_data[,-1,])
selectable_scatter_plot(plot_data[,-1], plot_data[,-1])
rm(pca, plot_data, module_factor)